#' ---
#' title: "Campaign Communication and Legislative Leadership (PSRM)"
#' subtitle: "07_keyness_and_wordfish.R"
#' author: "Authors: Stefan Mueller and Naofumi Fujimura"
#' date: "Note: Code compiled successfully on `r format(Sys.time(), '%d %B %Y')`"
#' ---


# load packages
library(quanteda)            # CRAN v3.3.1
library(quanteda.textmodels) # CRAN v0.9.6
library(quanteda.textstats)  # CRAN v0.96.3
library(xtable)              # CRAN v1.8-4
library(readr)               # CRAN v2.1.4
library(dplyr)               # CRAN v1.1.2
library(ggplot2)             # CRAN v3.4.2
library(stringr)             # CRAN v1.5.0

# If the code does not run, one or more packages may have been 
# updated, which may result in errors or conflicts. You can solve this issue
# by installing the package version listed above or by using the 
# groundhog package:
# after installing groundhog using install.packages("groundhog")
# change library(name_of_package) to
# groundhog::groundhog.library(name_of_package, date = "2024-01-31")
# Instead of adjusting the library() function for each package, 
# you can adjust them at all once using the
# the following syntax:
# groundhog.library("library('pkgA')
#                   library('pkgB')
#                   library('pkgC')", date = "2024-01-31")
# More details are available at: https://groundhogr.com/using/


# make sure to use quanteda package version 3.3.1 
# if you have issues reproducing the analysis
packageVersion("quanteda")

# make sure to use quanteda.textmodels package version 0.9.6
# if you have issues reproducing the analysis
packageVersion("quanteda.textmodels")

# make sure to use quanteda.textstats package version 0.96.3
# if you have issues reproducing the analysis
packageVersion("quanteda.textstats")

# print output of sessionInfo()
sessionInfo()

# load custom ggplot2 scheme
source("function_theme_base.R")

# load BERT-classified data
dat_predicted_raw <- readr::read_csv("data_predicted_full.csv")

# only keep manifestos included in the analysis (calculated in previous script)
dat_included <- read.csv("data_manifestos_included.csv",
                         fileEncoding = "utf-8")

cands_included <- unique(dat_included$filename)


# keep only sentences from candidates who are considered in the final analysis
dat_predicted <- filter(dat_predicted_raw, doc_id %in% cands_included)

# recode predictions (numbers) into policy areas labels
dat_predicted <- dat_predicted |> 
    mutate(predicted_class_bert = dplyr::recode(label,
                                                "0" = "Agriculture, Forestry, and Fisheries",
                                                "1" = "Committees on Cabinet",
                                                "2" = "Economy, Trade and Industry",
                                                "3" = "Education, Culture, Sports, Science, and Technology",
                                                "4" = "Environment",
                                                "5" = "Financial Affairs",
                                                "6" = "Foreign Affairs",
                                                "7" = "Health, Labor, and Welfare",
                                                "8" = "Internal Affairs and Communications",
                                                "9" = "Land, Infrastructure, Transport, and Tourism",
                                                "10" = "No policy area",
                                                "11" = "Security"
))



# count number and proportion of policy areas
dat_predicted_count <- dat_predicted |> 
    group_by(predicted_class_bert) |> 
    count() |> 
    ungroup() |> 
    mutate(prop = n / sum(n)) |> 
    mutate(predicted_class_bert = dplyr::recode(predicted_class_bert, "No policy area" = "No/Other Policy Area"))

# make sure proportions add up to 1
sum(dat_predicted_count$prop)

# Create Figure A5 
ggplot(dat_predicted_count, aes(x = prop, y = reorder(predicted_class_bert, prop))) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = paste0(sprintf("%.1f%%; n=", round(100 *prop, 1)), str_trim(format(n, big.mark = ",")))), 
              nudge_x = 0.005, hjust = 0,
              colour = "grey50") +
    scale_y_discrete(labels = scales::label_wrap(40)) +
    scale_x_continuous(labels = scales::percent_format(accuracy = 1),
                       limits = c(0, 0.55),
                       breaks = c(seq(0, 0.5, 0.1))) +
    labs(x = "Percentage of Statements in Full Text Corpus", y = NULL)
ggsave("fig_a05.pdf",
       width = 9, height = 6)


# keyness analysis for policy areas

# count number and proportion of policy areas
table(dat_predicted$predicted_class_bert)
prop.table(table(dat_predicted$predicted_class_bert))

# create corpus and tokenize
toks <- dat_predicted |>
    rename(docid = doc_id) |> 
    corpus() |> 
    tokens(what = "word3") # use tokenizer from quanteda v3

# store as toks_refi to refine the tokenisation for Japanese text
# Note: we follow this tutorial: https://tutorials.quanteda.io/multilingual/japanese/

toks_refi <- toks 

tstat_kanji <- toks |>
    tokens_select('^[一-龠]+$', valuetype = 'regex', padding = TRUE) |>
    textstat_collocations(min_count = 5, tolower = FALSE) |> 
    as.data.frame() |> 
    subset(z > 2)

toks_refi <- tokens_compound(toks_refi, tstat_kanji,
                             concatenator = '', join = TRUE)

tstat_kana <- toks_refi |>
    tokens_select('^[ァ-ヶー]+$', valuetype = 'regex', padding = TRUE) |>
    textstat_collocations(min_count = 5, tolower = FALSE) |> 
    as.data.frame() |> 
    subset(z > 2)

toks_refi <- tokens_compound(toks_refi, tstat_kana,
                             concatenator = '', join = TRUE)

tstast_any <- toks_refi |>
    tokens_select('^[０-９ァ-ヶー一-龠]+$', valuetype = 'regex', padding = TRUE) |>
    textstat_collocations(min_count = 5, tolower = FALSE) |> 
    as.data.frame() |> 
    subset(z > 2)

toks_refi <- tokens_compound(toks_refi, tstast_any,
                             concatenator = '', join = TRUE)


toks_refi <- toks_refi |> 
    tokens(what = "word3",
           remove_numbers = TRUE, remove_symbols = TRUE,
           remove_punct = TRUE)

# get vector of all areas
areas <- unique(toks_refi$predicted_class_bert)

# conduct keyness analysis for each policy area

# empty data frame to store results
dat_keyness_words <- data.frame()

for (i in areas) {
    cat("Keyness analysis for", i, "\n")

    # group by policy area
    dfmat_keyness <- toks_refi |> 
        dfm() |>
        dfm_group(groups = toks_refi$predicted_class_bert)
    
    # run keyness analysis for area i
    keyness <- textstat_keyness(dfmat_keyness, target = i)
    
    # create variable indicating the category
    keyness$policy_area <- i
    
    # bind data frame
    dat_keyness_words <- bind_rows(keyness, dat_keyness_words)
}


# get the top-30 terms per measure
dat_keyness_words_max_30 <- dat_keyness_words |>
    group_by(policy_area) |>
    mutate(rank = 1:n()) |>
    filter(rank <= 30) |> 
    dplyr::select(feature, policy_area, rank)

# Note: this data frame
# was then saved as a csv file and Japanese words
# were translated into English

# load and wrangle translated data 
# (first automated using Google translated, afterwards manually checked and improved)
dat_keyness_translated_raw <- read_csv("data_keyness_jp_translated.csv")

dat_keyness_translated_raw <- dat_keyness_translated_raw |> 
    mutate(policy_area = str_replace_all(policy_area, "Labour", "Labor"))

# remove 7 features that cannot be translated/are not meaningful
dat_keyness_translated <- filter(dat_keyness_translated_raw, !str_detect(feature_english, "\\?"))

# check how many terms removed
nrow(dat_keyness_translated_raw) - nrow(dat_keyness_translated)

# bind terms for each measure (unit of observation is not the measure, rather than the term)
dat_keyness_words_max_30_tab <- dat_keyness_translated |>
    arrange(policy_area) |>
    group_by(policy_area) |>
    mutate(feature_english = str_squish(str_to_lower(feature_english))) |> # lower-case all terms
    mutate(feat_ja_en = paste0(feature, " (", feature_english, ")")) |> # combine Japanese and translated term
    summarise(Terms = paste(feat_ja_en, collapse = ", ")) |>
    ungroup() |>
    rename(`Policy Area` = policy_area)

# print table
dat_keyness_words_max_30_tab

# save as Table A4
print(xtable(dat_keyness_words_max_30_tab, digits = 0),
      file = "tab_a04.html",
      type = "html", include.rownames=FALSE)


# summarise statement-level data back to manifesto-level data
dat_manifestolevel_raw <- dat_predicted |> 
    group_by(election_year, doc_id) |> 
    summarise(text = paste(text, collapse = " "))

nrow(dat_manifestolevel_raw)

# # select only manifestos that are included in the analysis
dat_manifestolevel <- filter(dat_manifestolevel_raw, doc_id %in% cands_included)

# create text corpus
corp <- corpus(dat_manifestolevel)

# tokenize again by following same procedure as above
toks <- tokens(corp, what = "word3")

toks_refi <- toks

tstat_kanji <- toks |> 
    tokens_select('^[一-龠]+$', valuetype = 'regex', padding = TRUE) |> 
    textstat_collocations(min_count = 5, tolower = FALSE) |> 
    subset(z > 2)

toks_refi <- tokens_compound(toks_refi, tstat_kanji,
                             concatenator = '', join = TRUE)

tstat_kana <- toks_refi |> 
    tokens_select('^[ァ-ヶー]+$', valuetype = 'regex', padding = TRUE) |> 
    textstat_collocations(min_count = 5, tolower = FALSE) |> 
    subset(z > 2)

toks_refi <- tokens_compound(toks_refi, tstat_kanji,
                             concatenator = '', join = TRUE) 
tstast_any <- toks_refi |> 
    tokens_select('^[０-９ァ-ヶー一-龠]+$', valuetype = 'regex', padding = TRUE) |> 
    textstat_collocations(min_count = 5, tolower = FALSE) |> 
    subset(z > 2)

toks_refi <- tokens_compound(toks_refi, tstast_any,
                             concatenator = '', join = TRUE)

dfmat <- toks_refi |> 
    tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE) |> 
    tokens_remove(pattern = stopwords("jap", source = "marimo")) |> 
    dfm()

# store Wordfish estimates in empty data frame
tmod_estimates <- data.frame()

ndoc(dfmat)

# run wordfish for each year
for (i in unique(dfmat$election_year)) {
    
    cat("Wordfish model for", i, "\n")
    
    dfmat_year <- dfm_subset(dfmat, election_year == i) |> 
        dfm_trim(min_docfreq = 3, min_termfreq = 3)
    
    tmod_wf <- quanteda.textmodels::textmodel_wordfish(dfmat_year)
    
    tmod_wf_predict <- predict(tmod_wf, se.fit = TRUE)
    tmod_wf_predict_df <- as.data.frame(tmod_wf_predict)
    tmod_wf_predict_df$doc_id <- rownames(tmod_wf_predict_df)
    tmod_wf_predict_df$election_year <- i
    
    tmod_estimates <- bind_rows(tmod_estimates, tmod_wf_predict_df)
    
}

# number of estimates
nrow(tmod_estimates)

# give more meaningful variable names
tmod_estimates <- tmod_estimates |> 
    rename(wordfish_raw = fit,
           wordfish_raw_se = se.fit)


# prepare merging of Wordfish estimates with dataset
tmod_estimates$filename <- tmod_estimates$doc_id

# group by election year and get distance 
# between WF estimate and party's WF estimate
dat_wf <- tmod_estimates |> 
    group_by(election_year) |> 
    mutate(distance_wf_mean = wordfish_raw - mean(wordfish_raw, na.rm = TRUE)) |> 
    mutate(distance_abs_wf_mean = abs(distance_wf_mean))

# Load candidate-level dataset created in previous script

dat_cand <- readRDS("data_analysis_bert.rds")

# merge with dataset on positions and salience
dat_reg_wf <- left_join(dat_cand, dat_wf, by = "filename")

# ungroup dataset
dat_reg_wf <- ungroup(dat_reg_wf)

# reorder columns in data frame
dat_reg_wf_tidy <- dat_reg_wf |> 
    dplyr::select(-c(manifesto_id, portfolio_survey,
                     committee, manifesto_id,
                     prop_policyarea_bert, # exlucde since they have NA values
                     prop_policyarea_relevant_bert
                     )) |> # remove unused variables
    dplyr::select(filename, name, position_dummy, year, 
                  se_importance:distance_abs_wf_mean, 
                  ends_with("no_na"),
                  everything())


# store dataset for regression analysis
saveRDS(dat_reg_wf_tidy, "data_regression_bert_wf.rds")
